1 Setup

  • Libraries
suppressPackageStartupMessages({
    library(data.table)
    library(DESeq2)
    library(gplots)
    library(here)
    library(hyperSpec)
    library(RColorBrewer)
    library(tidyverse)
    library(VennDiagram)
    library(emoji)
    library(ggVennDiagram)
})
  • Helper files
suppressMessages({
    source(here("UPSCb-common/Rtoolbox/src/plotEnrichedTreemap.R"))
    source(here("UPSCb-common/src/R/featureSelection.R"))
    source(here("UPSCb-common/src/R/volcanoPlot.R"))
    source(here("UPSCb-common/src/R/gopher.R"))
})
  • Graphics
pal=brewer.pal(8,"Dark2")
hpal <- colorRampPalette(c("blue","white","red"))(100)
mar <- par("mar")
  • Functions
  1. plot specific gene expression
"line_plot" <- function(dds=dds,vst=vst,gene_id=gene_id){
    message(paste("Plotting",gene_id))
    sel <- grepl(gene_id,rownames(vst))
    stopifnot(sum(sel)==1)

    p <- ggplot(bind_cols(as.data.frame(colData(dds)),
                          data.frame(value=vst[sel,])),
                aes(x=Genotype,y=value,fill=Genotype)) +
        geom_boxplot() +
        geom_jitter(color="black") +
        scale_y_continuous(name="VST expression") + 
        ggtitle(label=paste("Expression for: ",gene_id))
    
    suppressMessages(suppressWarnings(plot(p)))
    return(NULL)
}
  1. extract the DE results. Default cutoffs are from Schurch et al., RNA, 2016
"extract_results" <- function(dds,vst,contrast,
                              padj=0.01,lfc=0.5,
                              plot=TRUE,verbose=TRUE,
                              export=TRUE,default_dir=here("data/analysis/DE"),
                              default_prefix="DE-",
                              labels=colnames(dds),
                              sample_sel=1:ncol(dds),
                              expression_cutoff=0,
                              debug=FALSE,filter=c("median",NULL),...){
    
    # get the filter
    if(!is.null(match.arg(filter))){
        filter <- rowMedians(counts(dds,normalized=TRUE))
        message("Using the median normalized counts as default, set filter=NULL to revert to using the mean")
    }
    
    # validation
    if(length(contrast)==1){
        res <- results(dds,name=contrast,filter = filter)
    } else {
        res <- results(dds,contrast=contrast,filter = filter)
    }
    
    stopifnot(length(sample_sel)==ncol(vst))
    
    if(plot){
        par(mar=c(5,5,5,5))
        volcanoPlot(res)
        par(mar=mar)
    }
    
    # a look at independent filtering
    if(plot){
        plot(metadata(res)$filterNumRej,
             type="b", ylab="number of rejections",
             xlab="quantiles of filter")
        lines(metadata(res)$lo.fit, col="red")
        abline(v=metadata(res)$filterTheta)
    }
    
    if(verbose){
        message(sprintf("The independent filtering cutoff is %s, removing %s of the data",
                        round(metadata(res)$filterThreshold,digits=5),
                        names(metadata(res)$filterThreshold)))
        
        max.theta <- metadata(res)$filterNumRej[which.max(metadata(res)$filterNumRej$numRej),"theta"]
        message(sprintf("The independent filtering maximises for %s %% of the data, corresponding to a base mean expression of %s (library-size normalised read)",
                        round(max.theta*100,digits=5),
                        round(quantile(counts(dds,normalized=TRUE),probs=max.theta),digits=5)))
    }
    
    if(plot){
        qtl.exp=quantile(counts(dds,normalized=TRUE),probs=metadata(res)$filterNumRej$theta)
        dat <- data.frame(thetas=metadata(res)$filterNumRej$theta,
                          qtl.exp=qtl.exp,
                          number.degs=sapply(lapply(qtl.exp,function(qe){
                              res$padj <= padj & abs(res$log2FoldChange) >= lfc & 
                                  ! is.na(res$padj) & res$baseMean >= qe
                          }),sum))
        if(debug){
            plot(ggplot(dat,aes(x=thetas,y=qtl.exp)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("base mean expression") +
                     geom_hline(yintercept=expression_cutoff,
                                linetype="dotted",col="red"))
        
            p <- ggplot(dat,aes(x=thetas,y=qtl.exp)) + 
                geom_line() + geom_point() +
                scale_x_continuous("quantiles of expression") + 
                scale_y_log10("base mean expression") + 
                geom_hline(yintercept=expression_cutoff,
                           linetype="dotted",col="red")
            suppressMessages(suppressWarnings(plot(p)))
            
            plot(ggplot(dat,aes(x=thetas,y=number.degs)) + 
                     geom_line() + geom_point() +
                     geom_hline(yintercept=dat$number.degs[1],linetype="dashed") +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Number of DE genes"))
            
            plot(ggplot(dat,aes(x=thetas,y=number.degs[1] - number.degs),aes()) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Cumulative number of DE genes"))
            
            plot(ggplot(data.frame(x=dat$thetas[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("quantiles of expression") + 
                     scale_y_continuous("Number of DE genes per interval"))
            
            plot(ggplot(data.frame(x=dat$qtl.exp[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                     geom_line() + geom_point() +
                     scale_x_continuous("base mean of expression") + 
                     scale_y_continuous("Number of DE genes per interval"))
            
            p <- ggplot(data.frame(x=dat$qtl.exp[-1],
                                   y=diff(dat$number.degs[1] - dat$number.degs)),aes(x,y)) + 
                geom_line() + geom_point() +
                scale_x_log10("base mean of expression") + 
                scale_y_continuous("Number of DE genes per interval") + 
                geom_vline(xintercept=expression_cutoff,
                           linetype="dotted",col="red")
            suppressMessages(suppressWarnings(plot(p)))
        }
    }
    
    sel <- res$padj <= padj & abs(res$log2FoldChange) >= lfc & ! is.na(res$padj) & 
        res$baseMean >= expression_cutoff
    
    if(verbose){
      message(sprintf(paste(
        ifelse(sum(sel)==1,
               "There is %s gene that is DE",
               "There are %s genes that are DE"),
        "with the following parameters: FDR <= %s, |log2FC| >= %s, base mean expression > %s"),
        sum(sel),padj,
        lfc,expression_cutoff))
    }
    
    # proceed only if there are DE genes
    if(sum(sel) > 0){
        val <- rowSums(vst[sel,sample_sel,drop=FALSE])==0
        if (sum(val) >0){
          warning(sprintf(paste(
            ifelse(sum(val)==1,
                   "There is %s DE gene that has",
                   "There are %s DE genes that have"),
            "no vst expression in the selected samples"),sum(val)))
          sel[sel][val] <- FALSE
        } 

        if(export){
            if(!dir.exists(default_dir)){
                dir.create(default_dir,showWarnings=FALSE,recursive=TRUE,mode="0771")
            }
            write.csv(res,file=file.path(default_dir,paste0(default_prefix,"results.csv")))
            write.csv(res[sel,],file.path(default_dir,paste0(default_prefix,"genes.csv")))
        }
        if(plot & sum(sel)>1){
            heatmap.2(t(scale(t(vst[sel,sample_sel]))),
                      distfun = pearson.dist,
                      hclustfun = function(X){hclust(X,method="ward.D2")},
                      trace="none",col=hpal,labRow = FALSE,
                      labCol=labels[sample_sel],...
            )
        }
    }
    return(list(all=rownames(res[sel,]),
                up=rownames(res[sel & res$log2FoldChange > 0,]),
                dn=rownames(res[sel & res$log2FoldChange < 0,])))
}
  1. extract and plot the enrichment results
extractEnrichmentResults <- function(enrichment,task="go",
                                     diff.exp=c("all","up","dn"),
                                     go.namespace=c("BP","CC","MF"),
                                     genes=NULL,export=TRUE,plot=TRUE,
                                     default_dir=here("data/analysis/DE"),
                                     default_prefix="DE",
                                     url="athaliana"){
    # process args
    diff.exp <- match.arg(diff.exp)
    de <- ifelse(diff.exp=="all","none",
                 ifelse(diff.exp=="dn","down",diff.exp))

    # sanity
    if( is.null(enrichment[[task]]) | length(enrichment[[task]]) == 0){
        message(paste("No enrichment for",task))
    } else {

        # write out
        if(export){
            write_tsv(enrichment[[task]],
                      file=here(default_dir,
                                paste0(default_prefix,"-genes_GO-enrichment.tsv")))
            if(!is.null(genes)){
                write_tsv(
                    enrichedTermToGenes(genes=genes,terms=enrichment[[task]]$id,url=url,mc.cores=16L),
                    file=here(default_dir,
                              paste0(default_prefix,"-enriched-term-to-genes.tsv"))
                )
            }
        }
        
        if(plot){
            sapply(go.namespace,function(ns){
                titles <- c(BP="Biological Process",
                            CC="Cellular Component",
                            MF="Molecular Function")
                suppressWarnings(tryCatch({plotEnrichedTreemap(enrichment,enrichment=task,
                                                               namespace=ns,
                                                               de=de,title=paste(default_prefix,titles[ns]))},
                                          error = function(e) {
                                              message(paste("Treemap plot failed for",ns, 
                                                            "because of:",e))
                                              return(NULL)
                                          }))
            })
        }
    }
}
  • Data
load(here("analysis/salmon/dds_mergeTechRep.rda"))
dds$Genotype <- relevel(dds$Genotype,ref = "T89")

1.1 Normalisation for visualisation

vsd <- varianceStabilizingTransformation(dds,blind=FALSE)
## using 'avgTxLength' from assays(dds), correcting for library size
vst <- assay(vsd)
vst <- vst - min(vst)
dir.create(here("analysis/DE"),showWarnings=FALSE)
save(vst,file=here("analysis/DE/vst-aware.rda"))
write_delim(as.data.frame(vst) %>% rownames_to_column("ID"),
            here("analysis/DE/vst-aware.tsv"))

1.2 Gene of interests

1.2.1 Knocked-out gene

The targeted locus for line 26 is Potri.006G174000/Potra001877g14982 (Potra2n6c13821) And for the line 03, it is Potri.018G096200/Potra001273g10998 (Potra2n6c13821) This is the expression level of targeted gene.

kogene <- "Potra2n6c13821"
stopifnot(kogene %in% rownames(vst))
line_plot(dds=dds,vst=vst,gene_id = kogene)
## Plotting Potra2n6c13821

## NULL

1.2.2 Other SM and LSM

suppressMessages(goi <- read_delim(here("doc/goi.txt")))
knitr::kable(goi)
Name V2
SNRPG Potra2n6c13472
SNRPB/N Potra2n11c22477
SNRPD1 Potra2n5c10994
SNRPD2 Potra2n14c27408
SNRPD3 Potra2n2c6364
SNRPF Potra2n8c17320
SNRPE Potra2n6c13821
LSM1A-B Potra2n1c1466
LSM2 Potra2n4c10338
LSM3A-B Potra2n5c11140
LSM4 Potra2n2c4537
LSM6A-B Potra2n8c17320
LSM7 Potra2n1c2324
LSM8 Potra2n1c2419
genelist <- goi$V2[!(goi$V2 == "Potra2n6c13821")]
stopifnot(all(genelist %in% rownames(vst)))
dev.null <- lapply(genelist,line_plot,dds=dds,vst=vst)
## Plotting Potra2n6c13472
## Plotting Potra2n11c22477

## Plotting Potra2n5c10994

## Plotting Potra2n14c27408

## Plotting Potra2n2c6364

## Plotting Potra2n8c17320

## Plotting Potra2n1c1466

## Plotting Potra2n4c10338

## Plotting Potra2n5c11140

## Plotting Potra2n2c4537

## Plotting Potra2n8c17320

## Plotting Potra2n1c2324

## Plotting Potra2n1c2419

1.3 Differential Expression

dds <- DESeq(dds)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
  • Dispersion estimation The dispersion estimation is adequate
plotDispEsts(dds)

Check the different contrasts

resultsNames(dds)
## [1] "Intercept"              "Genotype_line26_vs_T89" "Genotype_line3_vs_T89"

1.4 Results

1.4.0.1 Line 26

line26 <- extract_results(dds=dds,vst=vst,contrast="Genotype_line26_vs_T89",
                                                        default_dir = here("analysis/DE"),
                                                        default_prefix = "Line26_",
                                                        labels=dds$BioID,
                                                        sample_sel = dds$Genotype %in% c("line26","T89"), 
                                                        cexCol=.9, plot = TRUE, verbose = TRUE)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean

## The independent filtering cutoff is 0.14916, removing 28.82535% of the data
## The independent filtering maximises for 28.82535 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8765 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0

1.4.0.2 Line 03

line03 <- extract_results(dds=dds,vst=vst,contrast="Genotype_line3_vs_T89",
                                                         default_dir = here("analysis/DE"),
                                                         default_prefix = "Line03_",
                                                         labels=dds$BioID,
                                                         sample_sel = dds$Genotype %in% c("line3","T89"), 
                                                         cexCol=.9, plot = TRUE, verbose = TRUE)
## Using the median normalized counts as default, set filter=NULL to revert to using the mean

## The independent filtering cutoff is 0.14916, removing 28.82535% of the data
## The independent filtering maximises for 28.82535 % of the data, corresponding to a base mean expression of 0 (library-size normalised read)
## There are 8064 genes that are DE with the following parameters: FDR <= 0.01, |log2FC| >= 0.5, base mean expression > 0

1.4.1 Venn Diagram

res.list <- list("Line 26"=line26,
                                 "Line 03"=line03)

1.4.1.1 All DE genes

ggVennDiagram(lapply(res.list,"[[","all"), label_alpha = 0, label = "count") + 
    scale_fill_gradient(low="white",high = "dodgerblue") + 
    scale_x_continuous(expand = expansion(mult = .1))

1.4.1.2 Up-regulated DE genes (up in treated)

ggVennDiagram(lapply(res.list,"[[","up"), label_alpha = 0, label = "count") + 
    scale_fill_gradient(low="white",high = "dodgerblue") + 
    scale_x_continuous(expand = expansion(mult = .1))

1.4.1.3 Down-regulated DE genes (down in treated)

ggVennDiagram(lapply(res.list,"[[","dn"), label_alpha = 0, label = "count") + 
    scale_fill_gradient(low="white",high = "dodgerblue") + 
    scale_x_continuous(expand = expansion(mult = .1))

1.4.2 Gene Ontology enrichment

background <- rownames(vst)[featureSelect(vst,dds$Genotype,exp=0.00001)]

1.4.2.1 DEGs from Line26

res.list <- list("Line 26"=line26)

# enr.list <- lapply(res.list,function(r){
#   lapply(r,gopher,background=background,task="go",url="potra2")
# })
# 
# dev.null <- lapply(names(enr.list),function(n){
#   lapply(names(enr.list[[n]]),function(de){
#       extractEnrichmentResults(enr.list[[n]][[de]],
#                                                        diff.exp=de,
#                                                        genes=res.list[[n]][[de]],
#                                                        default_prefix=paste(n,de,sep="-"),
#                                                        url="potra2")
#   })
# })

1.4.2.2 DEGs from Line03

res.list <- list("Line 03"=line03)

# enr.list <- lapply(res.list,function(r){
#   lapply(r,gopher,background=background,task="go",url="potra2")
# })
# 
# dev.null <- lapply(names(enr.list),function(n){
#   lapply(names(enr.list[[n]]),function(de){
#       extractEnrichmentResults(enr.list[[n]][[de]],
#                                                        diff.exp=de,
#                                                        genes=res.list[[n]][[de]],
#                                                        default_prefix=paste(n,de,sep="-"),
#                                                        url="potra2")
#   })
# })

1.4.2.3 Common DEGs between two lines

res.list <- list("CommonLine26andLine03"=list("all"=intersect(line26$all,line03$all),
                                                                                            "up"=intersect(line26$up,line03$up),
                                                                                            "dn"=intersect(line26$dn,line03$dn)))

# enr.list <- lapply(res.list,function(r){
#   lapply(r,gopher,background=background,task="go",url="potra2")
# })
# 
# dev.null <- lapply(names(enr.list),function(n){
#   lapply(names(enr.list[[n]]),function(de){
#       extractEnrichmentResults(enr.list[[n]][[de]],
#                                                        diff.exp=de,
#                                                        genes=res.list[[n]][[de]],
#                                                        default_prefix=paste(n,de,sep="-"),
#                                                        url="potra2")
#   })
# })

2 Session Info

## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so;  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Etc/UTC
## tzcode source: system (glibc)
## 
## attached base packages:
##  [1] parallel  grid      stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] jsonlite_1.8.7              ggVennDiagram_1.2.3         LSD_4.1-0                   vsn_3.70.0                 
##  [5] tximport_1.30.0             pvclust_2.2-0               plotly_4.10.3               pheatmap_1.0.12            
##  [9] PCAtools_2.14.0             ggrepel_0.9.4               patchwork_1.1.3             magrittr_2.0.3             
## [13] gtools_3.9.4                emoji_15.0                  limma_3.58.1                VennDiagram_1.7.3          
## [17] futile.logger_1.4.3         lubridate_1.9.3             forcats_1.0.0               stringr_1.5.0              
## [21] dplyr_1.1.3                 purrr_1.0.2                 readr_2.1.4                 tidyr_1.3.0                
## [25] tibble_3.2.1                tidyverse_2.0.0             RColorBrewer_1.1-3          hyperSpec_0.100.0          
## [29] xml2_1.3.5                  ggplot2_3.4.4               lattice_0.21-8              here_1.0.1                 
## [33] gplots_3.1.3                DESeq2_1.42.0               SummarizedExperiment_1.32.0 Biobase_2.62.0             
## [37] MatrixGenerics_1.14.0       matrixStats_1.1.0           GenomicRanges_1.54.1        GenomeInfoDb_1.38.0        
## [41] IRanges_2.36.0              S4Vectors_0.40.1            BiocGenerics_0.48.1         data.table_1.14.8          
## 
## loaded via a namespace (and not attached):
##  [1] rstudioapi_0.15.0         rmarkdown_2.25            farver_2.1.1              zlibbioc_1.48.0          
##  [5] vctrs_0.6.4               DelayedMatrixStats_1.24.0 RCurl_1.98-1.13           htmltools_0.5.7          
##  [9] S4Arrays_1.2.0            curl_5.1.0                lambda.r_1.2.4            SparseArray_1.2.1        
## [13] sass_0.4.7                bslib_0.5.1               KernSmooth_2.23-22        htmlwidgets_1.6.2        
## [17] plyr_1.8.9                testthat_3.2.0            cachem_1.0.8              futile.options_1.0.1     
## [21] lifecycle_1.0.4           pkgconfig_2.0.3           rsvd_1.0.5                Matrix_1.6-0             
## [25] R6_2.5.1                  fastmap_1.1.1             GenomeInfoDbData_1.2.11   digest_0.6.33            
## [29] colorspace_2.1-0          rprojroot_2.0.4           dqrng_0.3.1               irlba_2.3.5.1            
## [33] beachmat_2.18.0           labeling_0.4.3            fansi_1.0.5               timechange_0.2.0         
## [37] httr_1.4.7                abind_1.4-5               compiler_4.3.1            proxy_0.4-27             
## [41] bit64_4.0.5               withr_2.5.2               BiocParallel_1.36.0       DBI_1.1.3                
## [45] highr_0.10                DelayedArray_0.28.0       classInt_0.4-10           caTools_1.18.2           
## [49] units_0.8-4               tools_4.3.1               glue_1.6.2                sf_1.0-14                
## [53] reshape2_1.4.4            generics_0.1.3            gtable_0.3.4              tzdb_0.4.0               
## [57] class_7.3-22              preprocessCore_1.64.0     hms_1.1.3                 BiocSingular_1.18.0      
## [61] ScaledMatrix_1.10.0       utf8_1.2.4                XVector_0.42.0            pillar_1.9.0             
## [65] vroom_1.6.4               bit_4.0.5                 deldir_1.0-9              tidyselect_1.2.0         
## [69] locfit_1.5-9.8            knitr_1.45                xfun_0.41                 statmod_1.5.0            
## [73] brio_1.1.3                stringi_1.7.12            yaml_2.3.7                lazyeval_0.2.2           
## [77] evaluate_0.23             codetools_0.2-19          interp_1.1-4              BiocManager_1.30.22      
## [81] RVenn_1.1.0               cli_3.6.1                 affyio_1.72.0             jquerylib_0.1.4          
## [85] munsell_0.5.0             Rcpp_1.0.11               png_0.1-8                 latticeExtra_0.6-30      
## [89] jpeg_0.1-10               sparseMatrixStats_1.14.0  bitops_1.0-7              viridisLite_0.4.2        
## [93] e1071_1.7-13              scales_1.2.1              affy_1.80.0               crayon_1.5.2             
## [97] rlang_1.1.2               cowplot_1.1.1             formatR_1.14
 

drawing

Created by Fai Theerarat Kochakarn

theerarat.kochakarn@umu.se